In [1]:
filing = EdgarSDFiling.objects.get(pk=711)
docs = filing.edgarsdfilingdocument_set.all()
In [2]:
doc_a = docs[0]
content_a = doc_a.edgardocumentcontent_set.get()
doc_a.description
Out[2]:
In [3]:
doc_b = docs[4]
content_b = doc_b.edgardocumentcontent_set.get()
doc_b.description
Out[3]:
In [4]:
content_a.content[0:200]
Out[4]:
In [5]:
content_b.content[0:200]
Out[5]:
In [98]:
import toolz
from urlextract import URLExtract
extractor = URLExtract()
In [7]:
all_urls = []
for doc in docs:
    try:
        # Get the doc content
        doc_content = doc.edgardocumentcontent_set.get()
    except EdgarDocumentContent.DoesNotExist:
        continue
    content = doc_content.content
    if content:
        print(doc_content.id)
        urls = extractor.find_urls(content)
        if urls:
            all_urls.extend(urls)
unique_urls = toolz.unique(all_urls)
print(list(unique_urls))
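This extract-and-dedupe loop is repeated for each filing below, so it is worth factoring into a helper. A minimal sketch, assuming the Django models (EdgarDocumentContent in particular) are already importable as in these cells; extract_doc_urls is a hypothetical name, not part of the project:

import toolz
from urlextract import URLExtract

extractor = URLExtract()

def extract_doc_urls(docs):
    # Gather URLs from every document that has stored content,
    # deduplicating while preserving first-seen order.
    all_urls = []
    for doc in docs:
        try:
            doc_content = doc.edgardocumentcontent_set.get()
        except EdgarDocumentContent.DoesNotExist:  # assumed already imported
            continue
        if doc_content.content:
            all_urls.extend(extractor.find_urls(doc_content.content))
    return list(toolz.unique(all_urls))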
In [8]:
for doc in docs:
    print(doc.id)
In [9]:
extract_urls = docs.values_list('edgardocumentcontent__urls', flat=True)
In [10]:
def compact(iterable):
    # Drop None/empty entries from the queryset results
    return filter(None, iterable)
In [11]:
compacted = list(compact(extract_urls))
compacted
Out[11]:
In [12]:
from itertools import chain
flattened = list(chain.from_iterable(compacted))
flattened
Out[12]:
In [13]:
list(toolz.unique(flattened))
Out[13]:
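Cells In [11] through In [13] form one pipeline: drop the empty rows, flatten, deduplicate. The same result can be had in a single expression; a sketch using toolz.concat (equivalent to itertools.chain.from_iterable) in place of the intermediate lists:

# compact -> flatten -> dedupe in one pass
list(toolz.unique(toolz.concat(compact(extract_urls))))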
In [14]:
filing.extracted_urls
Out[14]:
In [16]:
filing = EdgarSDFiling.objects.get(pk=775)
In [19]:
docs = filing.edgarsdfilingdocument_set.all()
docs
Out[19]:
In [30]:
all_urls = []
extractor = URLExtract()
for doc in docs:
    try:
        # Get the doc content
        doc_content = doc.edgardocumentcontent_set.get()
    except EdgarDocumentContent.DoesNotExist:
        continue
    content = doc_content.content
    if content:
        urls = extractor.find_urls(content)
        print(urls)
        if urls:
            all_urls.extend(urls)
unique_urls = toolz.unique(all_urls)
print(list(unique_urls))
In [107]:
all_urls = []
extractor = URLExtract()
for doc in docs:
    try:
        # Get the doc content
        doc_content = doc.edgardocumentcontent_set.get()
    except EdgarDocumentContent.DoesNotExist:
        continue
    content = doc_content.content
    if content:
        # Crude pre-clean: drop the sentence-ending '.' after '.com'
        # so extracted URLs do not carry a trailing dot
        urls = extractor.find_urls(content.replace('.com.', '.com'))
        print(urls)
        if urls:
            all_urls.extend(urls)
unique_urls = toolz.unique(all_urls)
print(list(unique_urls))
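The '.com.' replace only covers .com domains and rewrites every occurrence, so text like 'example.com.br' would become 'example.combr'. A gentler alternative is to strip trailing sentence punctuation from each extracted URL instead; a sketch, where the punctuation set is an assumption about what actually appears in the filings:

def clean_url(url):
    # A trailing '.', ',' or ')' usually belongs to the sentence, not the URL
    return url.rstrip('.,;)')

urls = [clean_url(u) for u in extractor.find_urls(content)]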